Question 1: Scatterplots

# Load the needed packages.
# NOTE(review): rm(list = ls()) was removed -- wiping the global environment
# inside a script is an anti-pattern (it silently deletes whatever the user
# had in their workspace); start from a fresh R session instead.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.2
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# library(forcats) dropped: forcats is already attached by library(tidyverse)
# (see the attach message above).
library(ModelMetrics)
## 
## Attaching package: 'ModelMetrics'
## The following object is masked from 'package:base':
## 
##     kappa
library(modelr)
## 
## Attaching package: 'modelr'
## The following objects are masked from 'package:ModelMetrics':
## 
##     mae, mse, rmse
# Load the data (assumes els.RData sits in the working directory; this
# creates the `els` data frame used throughout).
load("els.RData")

# Summarize math scores (bynels2m), the dependent variable
els %>% summarize(mean(bynels2m, na.rm = TRUE))
## # A tibble: 1 x 1
##   `mean(bynels2m, na.rm = TRUE)`
##                            <dbl>
## 1                           45.4
# Histogram of math scores
gg <- ggplot(els, aes(x = bynels2m)) +
  geom_histogram()
gg
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 313 rows containing non-finite values (stat_bin).

# Density plot of the same variable
gg <- ggplot(els, aes(x = bynels2m)) +
  geom_density()
gg
## Warning: Removed 313 rows containing non-finite values (stat_density).

# Simple bivariate regression: math score (outcome) on SES (predictor)
mod1 <- lm(bynels2m ~ byses1, data = els)
summary(mod1)
## 
## Call:
## lm(formula = bynels2m ~ byses1, data = els)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.658  -8.801   0.400   9.048  39.042 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45.0379     0.0993   453.5   <2e-16 ***
## byses1        7.9535     0.1335    59.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.27 on 15323 degrees of freedom
##   (964 observations deleted due to missingness)
## Multiple R-squared:  0.1882, Adjusted R-squared:  0.1881 
## F-statistic:  3552 on 1 and 15323 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the coefficients
confint(mod1)
##                 2.5 %    97.5 %
## (Intercept) 44.843273 45.232561
## byses1       7.691958  8.215111
# Scatterplot of math score against SES with the fitted OLS line overlaid
g1 <- ggplot(els, aes(x = byses1, y = bynels2m)) +
  geom_point(shape = 1) +      # hollow circles for the raw data
  geom_smooth(method = "lm")   # least-squares fit line
g1
## Warning: Removed 964 rows containing non-finite values (stat_smooth).
## Warning: Removed 964 rows containing missing values (geom_point).

Question 2: Adjust the scale of variables

# Extend the plot window so both axes include the origin (0, 0)
g1 + expand_limits(x = 0, y = 0)
## Warning: Removed 964 rows containing non-finite values (stat_smooth).
## Warning: Removed 964 rows containing missing values (geom_point).

# Widen the displayed ranges explicitly: x to [0, 3], y to [10, 90]
g1 + expand_limits(x = c(0, 3), y = c(10, 90))
## Warning: Removed 964 rows containing non-finite values (stat_smooth).

## Warning: Removed 964 rows containing missing values (geom_point).

Question 3: Add a line to best fit your graphic

# Plotting the lm line.
# NOTE: this repeats the Question 1 model so its output is visible here too.
mod1 <- lm(bynels2m ~ byses1, data = els)  # math score ~ SES
summary(mod1)
## 
## Call:
## lm(formula = bynels2m ~ byses1, data = els)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.658  -8.801   0.400   9.048  39.042 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45.0379     0.0993   453.5   <2e-16 ***
## byses1        7.9535     0.1335    59.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.27 on 15323 degrees of freedom
##   (964 observations deleted due to missingness)
## Multiple R-squared:  0.1882, Adjusted R-squared:  0.1881 
## F-statistic:  3552 on 1 and 15323 DF,  p-value: < 2.2e-16
confint(mod1)
##                 2.5 %    97.5 %
## (Intercept) 44.843273 45.232561
## byses1       7.691958  8.215111
# Scatterplot with the least-squares line
g1 <- ggplot(els, aes(x = byses1, y = bynels2m)) +
  geom_point(shape = 1) +      # raw observations as hollow circles
  geom_smooth(method = "lm")   # straight-line (OLS) fit
g1
## Warning: Removed 964 rows containing non-finite values (stat_smooth).
## Warning: Removed 964 rows containing missing values (geom_point).

# Plotting the loess line.
# The lm fit below is shown again only for comparison with the loess smooth.
mod1 <- lm(bynels2m ~ byses1, data = els)  # math score ~ SES
summary(mod1)
## 
## Call:
## lm(formula = bynels2m ~ byses1, data = els)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.658  -8.801   0.400   9.048  39.042 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45.0379     0.0993   453.5   <2e-16 ***
## byses1        7.9535     0.1335    59.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.27 on 15323 degrees of freedom
##   (964 observations deleted due to missingness)
## Multiple R-squared:  0.1882, Adjusted R-squared:  0.1881 
## F-statistic:  3552 on 1 and 15323 DF,  p-value: < 2.2e-16
confint(mod1)
##                 2.5 %    97.5 %
## (Intercept) 44.843273 45.232561
## byses1       7.691958  8.215111
# Same scatterplot, but with a locally-weighted (loess) smooth instead of
# a straight line, to reveal any non-linearity in the relationship.
g1 <- ggplot(els, aes(x = byses1, y = bynels2m)) +
  geom_point(shape = 1) +
  geom_smooth(method = "loess")
g1
## Warning: Removed 964 rows containing non-finite values (stat_smooth).

## Warning: Removed 964 rows containing missing values (geom_point).

Question 4: Clean up the labeling for presentation

# Rebuild the bivariate regression and plot, then relabel for presentation
mod1 <- lm(bynels2m ~ byses1, data = els)  # math score ~ SES
summary(mod1)
## 
## Call:
## lm(formula = bynels2m ~ byses1, data = els)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.658  -8.801   0.400   9.048  39.042 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45.0379     0.0993   453.5   <2e-16 ***
## byses1        7.9535     0.1335    59.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.27 on 15323 degrees of freedom
##   (964 observations deleted due to missingness)
## Multiple R-squared:  0.1882, Adjusted R-squared:  0.1881 
## F-statistic:  3552 on 1 and 15323 DF,  p-value: < 2.2e-16
confint(mod1)
##                 2.5 %    97.5 %
## (Intercept) 44.843273 45.232561
## byses1       7.691958  8.215111
g1 <- ggplot(els, aes(x = byses1, y = bynels2m)) +
  geom_point(shape = 1) +      # base layer: hollow circles
  geom_smooth(method = lm)     # OLS line

# Human-readable axis titles, plus a second point layer drawn on top in
# purple diamonds (this restyles the visible points; the hollow circles
# remain underneath, which is why two geom_point warnings are emitted).
g1 + scale_x_continuous(name = "SocioEconomic Status") +
  scale_y_continuous(name = "Math Scores") +
  geom_point(color = "#800080", size = 1, shape = 23)
## Warning: Removed 964 rows containing non-finite values (stat_smooth).
## Warning: Removed 964 rows containing missing values (geom_point).

## Warning: Removed 964 rows containing missing values (geom_point).

Question 5: Describing the graphic above

There is a positive correlation between math score and socioeconomic status: as a test taker’s SES increases, the predicted math score increases as well (the fitted slope is about 7.95 math-score points per one-unit increase in SES, p < 2e-16).

Question 6: Create a regression that predicts reading score as a function of SES plus an additional covariate using the training data

# Bring in another covariate: parental education (bypared).

# Build the plot, dropping rows where parental education is missing.
# (Idiom fix: !is.na(x) replaces the original is.na(x)==FALSE.)
g2 <- ggplot(data = filter(els, !is.na(bypared)),
             aes(x = byses1, y = bynels2m,
                 color = as.factor(bypared)  # one color per parental-ed level
                 ))
# Smaller dots for readability with ~15k points
g2 <- g2 + geom_point(size = .25)
# Move the legend below the plot
g2 <- g2 + theme(legend.position = "bottom")
g2 <- g2 + ylab("Test Scores") + xlab("Socio Economic Status")
# Name the legend and fit a separate lm line for each parental-ed group
g2 <- g2 + scale_color_discrete(name = "Parental Ed") +
  geom_smooth(method = lm)

g2
## Warning: Removed 68 rows containing non-finite values (stat_smooth).
## Warning: Removed 68 rows containing missing values (geom_point).

# Transform the SES scale (roughly -1 to 1) into a 0-100 percentile
els <- els %>% mutate(byses_p = percent_rank(byses1) * 100)
els %>% summarize(mean(byses_p, na.rm = TRUE))
## # A tibble: 1 x 1
##   `mean(byses_p, na.rm = TRUE)`
##                           <dbl>
## 1                          49.8
# BUG FIX: the original built g3 on top of g2 (g3 <- g2 + ...), which
# duplicated the point/theme/scale layers and produced the
# "Scale for 'colour' is already present" warning. It also kept mapping
# x = byses1 even though the point of this section is the byses_p
# percentile just created. Both are corrected below.
g3 <- ggplot(data = filter(els, !is.na(bypared)),
             aes(x = byses_p, y = bynels2m,
                 color = as.factor(bypared)  # color by parental-ed level
                 ))
# Smaller dots for readability
g3 <- g3 + geom_point(size = .25)
# Legend below the plot
g3 <- g3 + theme(legend.position = "bottom")
g3 <- g3 + ylab("Test Scores") + xlab("Socio Economic Status (percentile)")
g3 <- g3 + scale_color_discrete(name = "Parental Ed") +
  geom_smooth(method = lm)  # one lm line per parental-ed group
g3
## Warning: Removed 68 rows containing non-finite values (stat_smooth).
## Warning: Removed 68 rows containing missing values (geom_point).

# Regression of math score on the SES percentile plus parental education
mod3 <- lm(bynels2m ~ byses_p +
             as.factor(bypared),
           data = els)
summary(mod3)
## 
## Call:
## lm(formula = bynels2m ~ byses_p + as.factor(bypared), data = els)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.605  -8.814   0.401   9.135  39.130 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         34.653393   0.405322  85.496   <2e-16 ***
## byses_p              0.216707   0.006004  36.094   <2e-16 ***
## as.factor(bypared)2  0.947310   0.469370   2.018   0.0436 *  
## as.factor(bypared)3 -0.520123   0.536834  -0.969   0.3326    
## as.factor(bypared)4  0.461198   0.549184   0.840   0.4010    
## as.factor(bypared)5 -0.022320   0.549197  -0.041   0.9676    
## as.factor(bypared)6 -0.609384   0.576310  -1.057   0.2904    
## as.factor(bypared)7 -0.295112   0.665803  -0.443   0.6576    
## as.factor(bypared)8 -1.060373   0.737483  -1.438   0.1505    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.28 on 15316 degrees of freedom
##   (964 observations deleted due to missingness)
## Multiple R-squared:  0.1872, Adjusted R-squared:  0.1867 
## F-statistic: 440.8 on 8 and 15316 DF,  p-value: < 2.2e-16
# Predict the test score for a student at the AVERAGE SES percentile with
# parental education level 8.
# BUG FIX: the original computed 34.653393 + 0.216707 * 8, which
# (a) plugged the parental-ed level (8) into the byses_p slope -- 8 is not
#     an SES percentile -- and
# (b) omitted the as.factor(bypared)8 coefficient entirely.
# predict() applies every term of the fitted model correctly.
predict(mod3,
        newdata = tibble(byses_p = mean(els$byses_p, na.rm = TRUE),
                         bypared = 8))

Question 7: Report RMSE

# Root-mean-squared error of mod3 over the full els data.
# modelr:: is namespaced explicitly because ModelMetrics also exports rmse()
# and masks it (see the attach messages at the top of the file).
# Wrapping the assignment in parentheses both stores and prints the value.
(rmse_3 <- modelr::rmse(mod3, els))
## [1] 12.27983